
rm(list = ls())

library(tidyverse)
library(readxl)
library(xlsx)
library(lubridate)

moscow_raw <- read_excel("/g_MoscowDataRaw.xlsx")


moscow_dta_temp <- moscow_raw %>%
  select(start_time = V8, end_time = V9, randomid = RandomID,
         matches("S1|S2", ignore.case = FALSE),
         matches("G1|G2|G3", ignore.case = FALSE),
         contains("lottery"), contains("PSM"), contains("big5"),
         matches("job_sect|job_pref|job_likely|job_qualities"),
         bribe_nonexp = dishonesty_4,
         age, male = gender, class_yr, contains("dep"),
         gpa = GPA, olympiad, contains("EGE"), region,
         city_size, contains("religion"), family_income,
         contains("relatives"), 
         soc_science_knowledg, device
  )




#PRELIMINARY VARIABLE CONSTRUCTION FOR DICE TASK GAME

#changing variables names of individual virtual dice rolls
#g1.1...g1.30 are the first set of 20 dice rolls (each respondent sees only 20 of these 30 dice profiles)
#g2.1...g2.30 are the second set of 20 dice rolls (each respondent sees only 20 of these 30 dice profiles)

names(moscow_dta_temp) = gsub(pattern = "\\(", replacement = "", x = names(moscow_dta_temp)) 
names(moscow_dta_temp) = gsub(pattern = "\\)", replacement = "", x = names(moscow_dta_temp)) 
names(moscow_dta_temp) = gsub(pattern = "G2v1_T", replacement = "g1.", x = names(moscow_dta_temp)) 
names(moscow_dta_temp) = gsub(pattern = "G2v2_T", replacement = "g2.", x = names(moscow_dta_temp)) 
moscow_dta_temp <- moscow_dta_temp %>% dplyr::rename(g1.25 = G2v1F_T1, g1.26 = G2v1F_T2,  g1.27 = G2v1F_T3,  g1.28 = G2v1F_T4, g1.29 = G2v1F_T5 , g1.30 = G2v1F_T6)
moscow_dta_temp <- moscow_dta_temp %>% dplyr::rename(g2.25 = G2v2F_T1, g2.26 = G2v2F_T2,  g2.27 = G2v2F_T3,  g2.28 = G2v2F_T4, g2.29 = G2v2F_T5 , g2.30 = G2v2F_T6)

# correct guesses are already coded as 1, incorrect guesses as NA; changing here NA to 0
#moscow_dta_temp <- moscow_dta_temp %>% mutate_at(vars(starts_with('g', ignore.case = F), -contains("gpa")), funs(replace_na(., 0)))
  #replaced the above line with the line below b/c "funs" deprecated
moscow_dta_temp <- moscow_dta_temp %>% mutate_at(vars(starts_with('g', ignore.case = F), -contains("gpa")), list(~replace_na(., 0)))

# correct_guesses1 is sum of correct guesses in first 20 dice rolls
dice_rolls1 <- moscow_dta_temp %>% select(starts_with('g1.', ignore.case = F))
moscow_dta_temp$correct_guesses1 <- rowSums(dice_rolls1)   

# correct_guesses2 is sum of correct guesses in second 20 dice rolls
dice_rolls2 <- moscow_dta_temp %>% select(starts_with('g2.', ignore.case = F))
moscow_dta_temp$correct_guesses2 <- rowSums(dice_rolls2)  


#PRELIMINARY VARIABLE CONSTRUCTION FOR BRIBERY GAME

moscow_dta_temp$bribe_accept_size1 = with(moscow_dta_temp,
                                          ifelse(G3_bur_br_350 > G3_bur_br_300, 7,
                                                 ifelse(G3_bur_br_300 > G3_bur_br_250, 6,
                                                        ifelse(G3_bur_br_250 > G3_bur_br_200, 5,
                                                               ifelse(G3_bur_br_200 > G3_bur_br_150, 4,
                                                                      ifelse(G3_bur_br_150 > G3_bur_br_100, 3,
                                                                             ifelse(G3_bur_br_100 > G3_bur_br_50, 2,
                                                                                    ifelse(G3_bur_br_50 > 0, 1, 0)))))))
)

#this version removes subjects with non-transitive preferences
moscow_dta_temp$bribe_accept_size2 = with(moscow_dta_temp,
                                          ifelse((G3_bur_br_50 <= G3_bur_br_100 & 
                                                    G3_bur_br_100 <= G3_bur_br_150 & 
                                                    G3_bur_br_150 <= G3_bur_br_200 & 
                                                    G3_bur_br_200 <= G3_bur_br_250 &
                                                    G3_bur_br_250 <= G3_bur_br_300 & 
                                                    G3_bur_br_300 <= G3_bur_br_350) == TRUE, bribe_accept_size1, NA)
)

#PRELIMINARY VARIABLE CONSTRUCTION FOR ACADEMIC DEPARTMENT

moscow_dta_temp <- moscow_dta_temp %>% mutate(dep_text = case_when(
                     grepl("бизнес|БИ|Логист|безопас|информ|сратег|менедж|бм|финансового|управлен", dep_TEXT, ignore.case = T) ~ "Bus",
                     grepl("между|миров|МЭ|МИЭФ", dep_TEXT, ignore.case = T) ~ "Intl",
                     grepl("экон|РЭШ|стат|финансов", dep_TEXT, ignore.case = T) ~ "Econ",
                     grepl("интел|комп|Програм|ФКН", dep_TEXT, ignore.case = T) ~ "CS",
                     grepl("ВШУ|урбан", dep_TEXT, ignore.case = T) ~ "Urban",
                     grepl("гуман|ФГН|истор|Линг|филос", dep_TEXT, ignore.case = T) ~ "Humanities",
                     grepl("матем|Мвтем|МИЭМ|ДПМ", dep_TEXT, ignore.case = T) ~ "Math",
                     grepl("комм|медиа|дизайн|журнал|реклам", dep_TEXT, ignore.case = T) ~ "Comm",
                     grepl("прав|юрис", dep_TEXT, ignore.case = T) ~ "Law",
                     grepl("Искусства|УиЭЗ|нет деп|-", dep_TEXT, ignore.case = T) ~ "Other")
                      )

#OTHER PRELIMINARY CLEANING

moscow_dta_temp <- moscow_dta_temp %>% mutate_at(vars(contains("_work_")), list(~replace_na(., 0)))



#CREATING DATA SET

moscow_dta <- moscow_dta_temp %>% transmute(
                  #DICTATOR GAME
                  donate = G1_donate_1, 
                  #converts to ruble amount
                  donate50 = donate*50,
                  #DICE TASK GAME
                  correct_guesses1, correct_guesses2,
                  correct_guesses = correct_guesses1 + correct_guesses2,
                  cheat_rate = (1/(1 - (1/6))) * 1/40 * correct_guesses - (1/6)/(1 - (1/6)),
                  #BRIBERY GAME
                  bribe_accept_size1,
                  bribe_accept_size2,
                  bribe_offer_size = G3_citizen_1,
                  bribe_offer = ifelse(bribe_offer_size > 0, 1, 0),
                  bribe_accept = ifelse(bribe_accept_size1 > 0, 1, 0),
                  bribe = ifelse(is.na(bribe_offer) == FALSE, bribe_offer, bribe_accept),
                  citizen_nobribe = G3_citizen_nobribe,
                  bur_nobribe = G3_bur_nobribe,
                  #RISK AVERSION MEASURE
                  risk_averse1 = 8 - (lottery_1 + lottery_2 + lottery_3 + lottery_4 + lottery_5 + lottery_6 + lottery_7),
                  #this version removes subjects with non-transitive preferences
                  risk_averse2 = ifelse((lottery_1 <= lottery_2 & 
                                           lottery_2 <= lottery_3 & 
                                           lottery_3 <= lottery_4 &
                                           lottery_4 <= lottery_5 &
                                           lottery_5 <= lottery_6 & 
                                           lottery_6 <= lottery_7) == TRUE, risk_averse1, NA),
                  #CAREER PREFERENCE VARIABLES
                  job_sect,
                  fed_pref = job_pref_1,
                  reg_pref = job_pref_2,
                  corp_pref = job_pref_3,
                  sme_pref = job_pref_4,
                  owner_pref = job_pref_5,
                  finance_pref = job_pref_6,
                  consult_pref = job_pref_7,
                  ngo_pref = job_pref_8,
                  gov_pref = job_pref_9,
                  fed_exp = job_likely_1,
                  reg_exp = job_likely_2,
                  corp_exp = job_likely_3,
                  sme_exp = job_likely_4,
                  owner_exp = job_likely_5,
                  finance_exp = job_likely_6,
                  consult_exp = job_likely_7,
                  ngo_exp = job_likely_8,
                  gov_exp = job_likely_9,
                  pub_avg = (fed_pref + reg_pref + gov_pref)/3,
                  priv_avg = (corp_pref + finance_pref + sme_pref + consult_pref + owner_pref)/5,
                  #JOB ATTRIBUTES
                  job_stable = job_qualities_1,
                  job_income = job_qualities_2,
                  job_bens = job_qualities_3,
                  job_promo = job_qualities_4,
                  job_interest = job_qualities_5,
                  job_altruism = job_qualities_6,
                  job_social = job_qualities_7,
                  job_schedule = job_qualities_8,
                  job_connect = job_qualities_9,
                  job_prestige = job_qualities_10,
                  job_intrinsic = (job_social + job_altruism + job_interest)/3,
                  job_extrinsic = (job_income + job_promo + job_connect + job_prestige)/4,
                  job_pragmatic = (job_stable + job_bens + job_schedule)/3,
                  #JUSTIFYING BRIBERY
                  bribe_nonexp, 
                  bribe_nonexp_dich = ifelse(bribe_nonexp < 3, 0, 1),
                  #PSM
                  aps1 = PSM_1, aps2 = PSM_2, aps3 = PSM_3, aps4 = PSM_4,
                  cpv1 = PSM_5, cpv2 = PSM_6, cpv3 = PSM_7, cpv4 = PSM_8,
                  com1 = PSM_9, com2 = PSM_10, com3 = PSM_11, com4 = PSM_12,
                  ss1 = PSM_13, ss2 = PSM_14, ss3 = PSM_15, ss4 = PSM_16,
                  aps = (aps1 + aps2 + aps3 + aps4)/4,
                  cpv = (cpv1 + cpv2 + cpv3 + cpv4)/4,
                  com = (com1 + com2 + com3 + com4)/4,
                  ss = (ss1 + ss2 + ss3 + ss4)/4,
                  psm = (aps + cpv + com + ss)/4,
                  #CONTROL VARIABLES
                  age, male, olympiad, gpa, family_income,
                  city_size, religion, religion_services,
                  religion_denom = recode(religion_denom, `1` = "Orthodox Christianity", `2` =  "Protestantism",
                                          `3` = "Catholicism", `4` = "Islam", `5` = "Judaism", `6` = "Buddhism",
                                          `7` = "Other"),
                  ege_taken = EGE_taken,
                  ege_math = as.numeric(EGE_3_TEXT),
                  ege_ru = na_if(EGE_10_TEXT, "-"),
                  ege_ru = as.numeric(ege_ru),
                  ege_avg = (ege_math + ege_ru)/2,
                  region1 = recode(region, `1` = "Moscow", `44` = "Moscow", `99` = "Foreign", .default = "Regions"),
                  region2 = plyr::mapvalues(region, 
                                            from = c(3, 8, 14, 25, 34, 57, #southern
                                                     1, 10, 11, 13, 16, 20, 26, 38, 40, 44, 52, 58, 66, 68, 70, 72, 85,#central
                                                     9, 31, 43, 47, 51, 53, 54, 59, 60, 75, 76, 82, 69,#volga
                                                     65, 67, #n caucasus
                                                     36, #crimea
                                                     2, 7, 15, 24, 29, 32, 48, #northwest
                                                     63, 74, 79, 80, 84, #ural
                                                     5, 12, 22, 30, 35, 49, 50, 73, 78, #siberia
                                                     6, 27, 41, 55, 62, 77, 61, #far east
                                                     99),#foreign 
                                            to = c(rep("Southern", 6), rep("Central", 17), rep("Volga", 13), rep("N. Caucasus", 2),
                                                   rep("Crimea", 1), rep("Northwest", 7), rep("Ural", 5), rep("Siberia", 9), 
                                                   rep("Far East", 7), rep("Foreign", 1))),
                  f_work_fed = relatives_work_1_1, m_work_fed = relatives_work_1_2, r_work_fed = relatives_work_1_3,
                  f_work_reg = relatives_work_2_1, m_work_reg = relatives_work_2_2, r_work_reg = relatives_work_2_3,
                  f_work_gov = relatives_work_3_1, m_work_gov = relatives_work_3_2, r_work_gov = relatives_work_3_3,
                  f_work_corp = relatives_work_4_1, m_work_corp = relatives_work_4_2, r_work_corp = relatives_work_4_3,
                  f_work_sme = relatives_work_5_1, m_work_sme = relatives_work_5_2, r_work_sme = relatives_work_5_3,
                  f_work_owner = relatives_work_6_1, m_work_owner = relatives_work_6_2, r_work_owner = relatives_work_6_3,
                  f_work_finance = relatives_work_7_1, m_work_finance = relatives_work_7_2, r_work_finance = relatives_work_7_3,
                  f_work_ngo = relatives_work_8_1, m_work_ngo = relatives_work_8_2, r_work_ngo = relatives_work_8_3,
                  f_work_law = relatives_work_9_1, m_work_law = relatives_work_9_2, r_work_law = relatives_work_9_3,
                  f_work_mil = relatives_work_10_1, m_work_mil = relatives_work_10_2, r_work_mil = relatives_work_10_3,
                  f_work_other = relatives_work_11_1, m_work_other = relatives_work_11_2, r_work_other = relatives_work_11_3,
                  f_no_work = relatives_work_12_1, m_no_work = relatives_work_12_2, r_no_work = relatives_work_12_3,
                  f_work_na = relatives_work_13_1, m_work_na = relatives_work_13_2, r_work_na = relatives_work_13_3,
                  f_pubsect = ifelse(f_work_fed == 1 | f_work_reg == 1 | f_work_gov == 1, 1, 0),
                  m_pubsect = ifelse(m_work_fed == 1 | m_work_reg == 1 | m_work_gov == 1, 1, 0),
                  r_pubsect = ifelse(r_work_fed == 1 | r_work_reg == 1 | r_work_gov == 1, 1, 0),
                  rel_pubsect = ifelse(f_pubsect == 1 | m_pubsect == 1 | r_pubsect == 1, 1, 0),
                  #collapsing all MA students together
                  class_yr = recode(class_yr, `6` = 5),
                  #integrating write-in info on departments
                  dep = ifelse(dep == 5 & is.na(dep_text) == FALSE, dep_text, dep),
                  dep = recode_factor(dep, `1` = "PA", `2` = "PS", `3` = "Soc", `4` = "Psych", `5` = "Other"),
                  dep2 = recode_factor(dep, "Law" = "Other", "Humanities" = "Other", "CS" = "Other", "Math" = "Other", "Intl" = "Other", "Urban" = "Other", "Psych" = "Other"),
                  #QUALITY CHECK VARIABLES
                  randomid, soc_sci_knowledge = soc_science_knowledg, device,
                  #time to complete
                  duration = round(difftime(end_time, start_time, units=c("auto","secs","mins","hours","days","weeks")), 0),
                  #screeners
                  screener1 = ifelse(is.na(S1_27) == F & is.na(S1_29) == F, 1, 0),
                  screener2 = ifelse(is.na(S2_5) == F & is.na(S2_10) == F, 1, 0),
                  screener = screener1 + screener2
                  )


##MERGING FOLLOW UP SURVEYS

moscow17raw <- read_excel("/h_MoscowFollowUp2017.xlsx")
moscow18raw <- read_excel("/i_MoscowFollowUp2018.xlsx")

moscow18 <- moscow18raw %>% select(Q1, RandomID)
moscow17 <- moscow17raw %>% select(Q1, RandomID)

moscow18 <- moscow18 %>% rename(randomid = "RandomID", career18 = "Q1")
moscow17 <- moscow17 %>% rename(randomid = "RandomID", career17 = "Q1")

#checking for duplicate participant IDs
moscow_dta %>% count(randomid) %>% filter(n > 1)
#these two IDs have duplicates 29566, 80655    

moscow18 %>% count(randomid) %>% filter(n > 1)
moscow17 %>% count(randomid) %>% filter(n > 1)

#checking for IDs duplicated in original in follow up surveys
moscow18 %>% filter(randomid == 29566, randomid == 80655)
moscow17 %>% filter(randomid == 29566, randomid == 80655)
#neither of the problematic RandomIDs are in the follow up sample

#merging
moscow_dta_clean_temp <- moscow_dta %>% left_join(moscow18, by = "randomid")  
moscow_dta_clean_temp <- moscow_dta_clean_temp %>% left_join(moscow17, by = "randomid")  



MoscowClean <- moscow_dta_clean_temp %>% mutate(
                                  #combining 2018 and 2017 data to use the latest type of employment
                                  career = ifelse(is.na(career18) == T, career17, career18),
                                  #removing participants who report post-graduate studies or not working
                                  career_ifwork = ifelse(career %in% c(10, 12), NA, career),
                                  #creating variable for whether employed in any public sector position
                                  pub = ifelse(career_ifwork == 1 | career_ifwork == 2 | career_ifwork == 3, 1, 0), 
                                  #creating variable for whether employed in any public sector position or ngo
                                  pub_ngo = ifelse(career_ifwork == 1 | career_ifwork == 2 | career_ifwork == 3 | career_ifwork == 9, 1, 0),
                                  #creating variable for whether employed in budget sector
                                  gov = ifelse(career_ifwork == 3, 1, 0),
                                  #creating similar variables for 2017 and 2018 follow up surveys separately
                                  career_ifwork18 = ifelse(career18 %in% c(10, 12), NA, career18),
                                  pub18 = ifelse(career_ifwork18 == 1 | career_ifwork18 == 2 | career_ifwork18 == 3, 1, 0), 
                                  pub_ngo18 = ifelse(career_ifwork18 == 1 | career_ifwork18 == 2 | career_ifwork18 == 3 | career_ifwork18 == 9, 1, 0),
                                  gov18 = ifelse(career_ifwork18 == 3, 1, 0),
                                  career_ifwork17 = ifelse(career17 %in% c(10, 12), NA, career17),
                                  pub17 = ifelse(career_ifwork17 == 1 | career_ifwork17 == 2 | career_ifwork17 == 3, 1, 0), 
                                  pub_ngo17 = ifelse(career_ifwork17 == 1 | career_ifwork17 == 2 | career_ifwork17 == 3 | career_ifwork17 == 9, 1, 0),
                                  gov17 = ifelse(career_ifwork17 == 3, 1, 0)
                                  )
#converting tibble to dataframe
MoscowClean_DF <- as.data.frame(MoscowClean)

#write.csv(MoscowClean_DF, "MoscowClean.csv", na = "", row.names = F)


